 cd "D:\Dropbox\Restat\"
log using Weisburd_Restat_Data_Clean,replace

/************************************************************************************************************************

Program: Weisburd_Restat_Data_Clean.do

Description: This program combines data files of employees with private and company coverage
             with their claims data as well as additional characteristics based on place of residence

Notes: 

1. This data is classified by client_num- since this is unique for each driver.                                           
We also have data available on policy but this is more complex to decipher since                                       
policys for the same car can change over time due to extending a contract etc.   

2. Dealing with overlap (the same client holding 2 policies during the same period):

   a. We deal with overlap in private clients by creating a unique id based on client num and license  
   b. We deal with overlap in company clients by creating a unique id based on client num and car model (license is not available) 
                             
3. If a new car is purchased - we cannot combine the shortened policies with previous policies - as this car 
has a new policy date (and will be seperately affected by accident costs).

4. As client numbers are proprietary data - any datasets with this information can only be accessed upon
   direct request to the author (saritw@post.tau.ac.il). These identifiers are removed from the final output data
   created in this file: Weisburd_Restat_Collisions.dta

Output file: weisburd_restat.dta														  
*************************************************************************************************************************/

clear all
clear matrix
set memory 500m

/*loading data for matching*/

*this dataset allows us to define individuals who are new to the insurance firm
insheet using company_renew_action.csv, clear
  gen s_date=date(start_date,"DMY")
  gen e_date=date(end_date,"DMY")
  format s_date %td
  format e_date %td
  sort client_num s_date num_action
  drop start_date end_date
  rename s_date start_date
  rename policy_num policy
  rename e_date end_date
  sort client_num policy start_date end_date
  keep if action=="new"
  drop num_action
save company_new, replace
disp _N

*this includes data on annual premiums paid by individuals with private coverage
insheet using premiums_policy.csv, clear
  gen s_date=date(start_date,"DMY")
  gen e_date=date(end_date,"DMY")
  format s_date %td
  format e_date %td
  sort client_num s_date num_action
  drop start_date end_date
  rename s_date start_date
  rename policy_num policy
  rename e_date end_date
  sort client_num policy start_date end_date premium num_action  
  *these observations don't provide any additional information
  drop if premium==0
  gen problem=0
  replace problem=1 if client_num==client_num[_n-1] & start_date==start_date[_n-1] /*
    */ & end_date==end_date[_n-1] & premium+premium[_n-1]==0
  drop if problem[_n+1]==1
  drop if problem==1
  tab action
  gen addition=0
  gen cancel=0
  gen new=0
  gen renew=0
  replace addition=1 if action=="addition"
  replace cancel=1 if action=="cancel"
  replace new=1 if action=="new"
  replace renew=1 if action=="renew"
  collapse (sum) premium addition cancel new renew, by(client_num policy start_date end_date)
  sort client_num policy start_date end_date
save premiums, replace

*translating names to gender
insheet using names.csv,clear
  keep name sex
  sort name
save names,replace

*translating cities to english
insheet using cities.csv,names clear
  keep city english_city
  sort english_city
save cities,replace

insheet using avg_inc.csv,names clear
  summarize avg_inc*
  sort english_city
  merge english_city using cities
  keep if _merge==3
  sort english_city
  drop _merge
save cities1,replace

insheet using bagrut.csv,names clear
  sort english_city
  merge english_city using cities1
  tab _merge
  drop if _merge==1
  sort english_city
  drop _merge
save city_chars,replace

insheet using distance_from_work.csv,names clear
  sort english_city
save distances,replace

insheet using cities_direct.csv,names clear
  sort english_city
  merge english_city using city_chars
  tab _merge
  sort english_city
  drop _merge
  merge english_city using distances
  tab _merge
  sort city
  drop _merge
save city_chars2,replace

*distance of accident from place of employment
insheet using accident_location.csv,names clear
  rename accident_loc english_city
  sort english_city 
  merge english_city using distances
  drop if _merge==2
  replace english_city="home" if english_city==""
  rename distance accident_distance
  drop _merge
  sort location english_city
  drop if location==location[_n-1] & english_city==english_city[_n-1]
  *language issues with stata
  drop if location=="�" & english_city=="Bene Ayish"
save accident_loc,replace

insheet using accident_location2.csv,names clear
  rename accident_loc english_city
  sort english_city 
  merge m:1 english_city using distances
  drop if _merge==2
  replace english_city="home" if english_city==""
  rename distance accident_distance
  drop _merge
  sort location english_city
  drop if location==location[_n-1] & english_city==english_city[_n-1]
  *language issues with stata
  drop if location=="�" & english_city=="Rehovot"
  drop if location=="�" & english_city=="Rehovot"
  drop if location=="�" & english_city=="Nes Ziyyona"
  drop if location=="�" & english_city=="Ramat Hasharon"
save accident_loc2,replace

*distance of accident from place of residence
insheet using residence_accidents.csv, clear
  drop if accident_city==""
  sort english_city accident_city
save residence_accidents,replace 


/*Group within company with private insurance and cars*/

/*Clients are grouped by client_num unless there is an overlap with different cars over the same period
they are then grouped by client_num & license - b/c it implies there were 2 drivers involved*/

insheet using private_policies.csv,clear

  summarize

  *Arranging dates
  gen s_date=date(start_date,"MDY")
  format s_date %d
  gen e_date=date(end_date,"MDY")
  format e_date %d
  drop start_date end_date
  rename s_date start_date
  rename e_date end_date

  /*DATA CLEANING*/

  *These add ons are irrelevent and can be dropped
  *lets only keep cars with full insurance
  keep if insurance==1
  disp _N
  sort license model start_date end_date policy client_num
   
  *all of thse are simply doubles but the policy number is sometimes different
  *which is why we cannot do merging on policy - need to be careful about which policy is 
  *dropped because this data is merged with information on new policies
  drop if license==license[_n-1] & model==model[_n-1] /*
    */ & start_date==start_date[_n-1] & end_date==end_date[_n-1] /*
	*/ & client_num!=300911 & client_num!=500025 & client_num!=1900250 & client_num!=300080
	
  gsort license model start_date end_date -policy client_num
  drop if license==license[_n-1] & model==model[_n-1] /*
    */ & start_date==start_date[_n-1] & end_date==end_date[_n-1] 
	

  disp _N
  *first of all - there is a problematic policy only one month with different clientnum
  *let's delete it
  drop if client_num==300137
 
  *We need to deal with people who have the same car but extended their policy
  *Set the end date of their current policy to the new policy
  *We can change the end date for people who simply extended their policy
  egen policy_life=group(license city)

  sort client_num policy start_date end_date
  *these premiums are pro-rated according to the length of the contract 
  *computing relevent annual premium paid per policy
  merge 1:1 client_num policy start_date end_date using premiums
  tab _merge
  replace new=0 if new==.
  *this policy had a renew before the date of the "new" policy
  replace new=0 if client_num==300135
 
  egen check=group(client_num policy end_date)
  by check,sort: egen tot_premium=sum(premium)
  drop if _merge==2 
  drop _merge
  disp _N

  *this is the day I received the data
  replace end_date=date("24 april 2008", "DMY") if end_date>date("24 april 2008", "DMY") 
  replace tot_premium=tot_premium*(end_date-start_date)/365.2 if end_date==date("24 april 2008", "DMY") 
  summarize tot_premium
  
  *these are cars insured for one year seperately from others insured
  drop if policy_life==463 | policy_life==418 | policy_life==532 | policy_life==335
  drop if policy_life==393 | policy_life==493 | policy_life==343 | policy_life==475
  drop if policy_life==322 | policy_life==132 | policy_life==86 | policy_life==316 | policy_life==311
  drop if license==7532601 | license==3703704 | license==2515509 | license==6194156 | /*
    */ license==6371061 | license==7487005 | license==8877710 | license==7056763 | /*
    */ license==2534510 | license==5166820 | license==334119
   
  *this is a car that the same owner insured for a short overlap
  drop if license==5280505 & start_date>date("1 september 2001", "DMY")
  *this policy looks strange - it is divided into 3 1 month segments
  drop if license==8316903

  *these clients have multiple cars
  replace client_num=3000230 if policy_life==3
  replace client_num=3001860 if policy_life==92 | policy_life==226
  replace client_num=3004460 if policy_life==27 | policy_life==375 | policy_life==409
  replace client_num=3004461 if policy_life==271 
  replace client_num=3005070 if policy_life==515 | policy_life==534 
  replace client_num=3005071 if policy_life==430 
  replace client_num=3005600 if policy_life==1
  replace client_num=3006820 if policy_life==81 
  replace client_num=3009360 if policy_life==456
  replace client_num=5000350 if policy_life==506
  replace client_num=19001170 if policy_life==238
  replace client_num=19001830 if policy_life==216
  replace client_num=19001831 if policy_life==426
  replace client_num=3000330 if policy_life==387
  replace client_num=3000800 if policy_life==354 | policy_life==21
  replace client_num=3002680 if policy_life==511
  replace client_num=3003350 if policy_life==185
  replace client_num=3003420 if policy_life==249
  replace client_num=3003780 if policy_life==364 | policy_life==85
  replace client_num=3003781 if policy_life==244 | policy_life==23
  replace client_num=3003830 if policy_life==413
  replace client_num=3003920 if policy_life==80 | policy_life==552
  replace client_num=3005320 if policy_life==94
  replace client_num=3005790 if policy_life==405
  replace client_num=3006800 if policy_life==477
  replace client_num=3009330 if policy_life==69
  replace client_num=3010720 if policy_life==151
  replace client_num=3012680 if policy_life==217
  replace client_num=3012730 if policy_life==99
  replace client_num=4001620 if policy_life==557
  replace client_num=5000050 if policy_life==424
  replace client_num=5000700 if policy_life==170 | policy_life==252
  replace client_num=19000800 if policy_life==272
  replace client_num=19000840 if policy_life==330
  replace client_num=19001850 if policy_life==390
  replace client_num=3000240 if policy==103866
  replace client_num=3002090 if policy==204329
  replace client_num=3003410 if policy==213834
  replace client_num=3004180 if policy==223082
  replace client_num=3005080 if policy==220384
  replace client_num=3005730 if policy==200208
  replace client_num=3006460 if policy==219461
  replace client_num=3007030 if policy==215069
  replace client_num=3008580 if policy==216781
  replace client_num=3008640 if policy==219054
  replace client_num=5000740 if policy==213529
  replace client_num=19001310 if policy==217058

  *these are problematic policies whose end date needs to be adjusted
  replace end_date=15566 if policy_life==306
  replace end_date=15616 if license==6777007 & start_date==15553
  replace start_date=16284 if policy==218021
  replace end_date=16503 if policy==218021
  replace end_date=15391 if policy==206744
  replace end_date=15866 if policy==214287
  replace end_date=15455 if policy==212095
  replace end_date=15634 if policy==206893
  replace end_date=15943 if license==9553316 & start_date==15706
  replace end_date=16866 if license==6087329 & start_date==16618
  replace end_date=16769 if license==2782523 & start_date==16468
  replace end_date=17326 if license==9402819 & start_date==17106
  replace end_date=17499 if license==9689756 & start_date==17226
  replace end_date=15411 if policy==208072
  replace end_date=17140 if license==7195135 & start_date==17045
  replace end_date=17000 if license==5060904 & start_date==16680
  replace end_date=17627 if license==4058514 & start_date==17410
  replace end_date=mdy(12,30,2004) if license==1980650 & start_date==mdy(9,1,2004)
  replace end_date=mdy(12,30,2002) if license==7750505 & start_date==mdy(2,1,2002)
  replace end_date=mdy(6,19,2005) if license==5323108 & start_date==mdy(9,1,2004)
  replace end_date=mdy(1,8,2002) if license==1163403 & start_date==mdy(10,30,2001)
  replace end_date=mdy(3,5,2005) if license==3171708 & start_date==mdy(1,1,2005)
  
  *If a client is holding one policy with an overlap then
  *we correct the date (assuming he let someone else drive the car for that period)
  sort client_num start_date end_date policy_life 
  quietly by client_num start_date end_date:  gen dup = cond(_N==1,0,_n)
     
  gen overlap=0
  replace overlap=1 if client_num==client_num[_n-1] & start_date<end_date[_n-1] 
  by client_num, sort: egen tot_lap=sum(overlap)
  tab tot_lap
  replace end_date=start_date[_n+1]-1 if overlap[_n+1]==1 & tot_lap==1 & dup==0
  replace tot_premium=tot_premium*(end_date-start_date)/365.2 if tot_lap>0
  
  *we drop these overlaps
  drop if start_date>end_date
  drop if client_num==300110 & policy==208717 & start_date==mdy(9,1,2004)
  drop if client_num==300214 & policy==203341 & start_date==mdy(2,1,2002)
 

  gen policy_length=(end_date-start_date)/365.2
  summarize policy_length,d

  disp _N
  
  gsort policy_life start_date -end_date
  
  *If we have a policy that is fully included in the one before we should drop it
  drop if policy_life==policy_life[_n-1] & start_date>=start_date[_n-1] & end_date<=end_date[_n-1]

  *We will try to combine short policies
  by policy_life, sort:egen short_life=min(policy_length)
  by policy_life,sort: egen ttlpolicy=count(license)

  sort client_num policy_life start_date end_date policy 
  
save all,replace
disp _N
use all,clear

  drop dup
  summarize policy_length
  keep if short_life<0.5 & ttlpolicy>1
  disp _N

  *First policy's with 2 phases lets combine to 1
  by policy_life, sort:egen policy_start=min(start_date)
  by policy_life, sort:egen policy_end=max(end_date)
  by policy_life, sort:egen premium_sum=sum(tot_premium)

  summarize policy_length
  sort policy_life start_date policy 
  
  replace start_date=policy_start if ttlpolicy==2 & (policy_end-policy_start)/365.2 < 1.5 
  replace end_date=policy_end if ttlpolicy==2 & (policy_end-policy_start)/365.2 < 1.5 
  replace policy_length=(end_date-start_date)/365.2 
  summarize policy_length

  replace tot_premium=premium_sum if ttlpolicy==2
  summarize policy_length
  
  gen counter=1
  replace counter=counter[_n-1]+1 if policy_life==policy_life[_n-1]

  *there are 2 problematic policies that needs to be dropped (added a month after years break)
  drop if client_num==301306 & start_date==17418 & end_date==17448
  drop if client_num==300441 & start_date==17028 & end_date==17059

  /*we are only combining first 2 or last 2 policies otherwise needs to be done manually*/
  sort client_num start_date license end_date policy
    
  *set your start_date to previous start_date if you are second or last
  replace start_date=start_date[_n-1] if client_num==client_num[_n-1] & license==license[_n-1] /*
    */ & ttlpolicy>2 & (policy_length==short_life | policy_length[_n-1]==short_life) /*
    */ & (counter==2 | counter==ttlpolicy) 
  *set your end_date to next end_date if you are first or 1 before last  
  replace end_date=end_date[_n+1] if client_num==client_num[_n+1] & license==license[_n+1] &/*
    */ ttlpolicy>2 & (policy_length==short_life | policy_length[_n+1]==short_life)/*
    */ & (counter==1 | counter==ttlpolicy-1) 
  replace tot_premium=tot_premium+tot_premium[_n+1] if client_num==client_num[_n+1] & license==license[_n+1] & /*
    */ ttlpolicy>2 & (policy_length==short_life | policy_length[_n+1]==short_life)/*
    */ & (counter==1 | counter==ttlpolicy-1) 
  replace tot_premium=tot_premium+tot_premium[_n-1] if client_num==client_num[_n-1] & license==license[_n-1]& /*
    */ ttlpolicy>2 & (policy_length==short_life | policy_length[_n-1]==short_life) /*
    */ & (counter==2 | counter==ttlpolicy) 

  replace start_date=start_date[_n-1] if policy_life==policy_life[_n-1] & /*
    */ ttlpolicy>2 & policy_length[_n-1]==short_life & counter==4  
  *set your end_date to next end_date if you are first or 1 before last  
  replace end_date=end_date[_n+1] if policy_life==policy_life[_n+1] & /*
    */ ttlpolicy>2 & policy_length==short_life & counter==3 & counter<ttlpolicy 
  replace tot_premium=tot_premium+tot_premium[_n+1] if policy_life==policy_life[_n+1] & /*
    */ ttlpolicy>2 & policy_length==short_life & counter==3 & counter<ttlpolicy 
  replace tot_premium=tot_premium+tot_premium[_n-1] if policy_life==policy_life[_n-1] & /*
    */ ttlpolicy>2 & policy_length[_n-1]==short_life & counter==4 
  
  replace policy_length=(end_date-start_date)/365.2
  summarize policy_length
  drop short_life
  by policy_life, sort:egen short_life=min(policy_length)
 
  *Let's clean up what we fixed
  sort license model start_date end_date policy
  replace new=1 if policy_life==policy_life[_n+1] & start_date==start_date[_n+1] & end_date==end_date[_n+1] & new[_n+1]==1
  replace new=1 if policy_life==policy_life[_n-1] & start_date==start_date[_n-1] & end_date==end_date[_n-1] & new[_n-1]==1
   
  *these are duplicates
  drop if client_num==300033 & start_date==mdy(2,1,2007) & check==68
  drop if client_num==300076 & start_date==mdy(1,1,2007) & check==90
  drop if client_num==300080 & start_date==mdy(1,1,2007) & check==115
  drop if client_num==300110 & start_date==mdy(9,1,2004) & check==168
  drop if client_num==300114 & start_date==mdy(11,1,2006) & check==185
  drop if client_num==300124 & start_date==mdy(9,1,2004) & check==198
  drop if client_num==300128 & start_date==mdy(4,1,2007) & check==224
  drop if client_num==300144 & start_date==mdy(12,1,2006) & check==262
  drop if client_num==300193 & start_date==mdy(1,1,2007) & check==327
  drop if client_num==300214 & start_date==mdy(1,1,2007) & check==362
  drop if client_num==300259 & start_date==mdy(12,1,2006) & check==417
  drop if client_num==300262 & start_date==mdy(11,1,2006) & check==425
  drop if client_num==300291 & start_date==mdy(3,1,2007) & check==460
  drop if client_num==300317 & start_date==mdy(8,1,2006) & check==513
  drop if client_num==300372 & start_date==mdy(9,6,2007) & check==572
  drop if client_num==300383 & start_date==mdy(1,1,2007) & check==611
  drop if client_num==300393 & start_date==mdy(6,29,2003) & check==643
  drop if client_num==300427 & start_date==mdy(4,11,2002) & check==666
  drop if client_num==300435 & start_date==mdy(3,4,2003) & check==683
  drop if client_num==300444 & start_date==mdy(12,1,2006) & check==702
  drop if client_num==300478 & start_date==mdy(12,1,2006) & check==756
  drop if client_num==300528 & start_date==mdy(4,1,2007) & check==826
  drop if client_num==300568 & start_date==mdy(11,1,2006) & check==893
  drop if client_num==300637 & start_date==mdy(8,25,2005) & check==961
  drop if client_num==300640 & start_date==mdy(3,20,2008) & check==966
  drop if client_num==300642 & start_date==mdy(11,13,2003) & check==968
  drop if client_num==300677 & start_date==mdy(9,1,2007) & check==1012
  drop if client_num==300683 & start_date==mdy(5,16,2007) & check==1029
  drop if client_num==300787 & start_date==mdy(5,28,2002) & check==1166
  drop if client_num==300811 & start_date==mdy(11,20,2001) & check==1202
  drop if client_num==300838 & start_date==mdy(10,4,2002) & check==1228
  drop if client_num==300873 & start_date==mdy(10,30,2001) & check==1274
  drop if client_num==300911 & start_date==mdy(4,1,2007) & check==1333
  drop if client_num==300932 & start_date==mdy(1,1,2007) & check==1359
  drop if client_num==300933 & start_date==mdy(3,7,2006) & check==1375
  drop if client_num==301033 & start_date==mdy(1,1,2007) & check==1489
  drop if client_num==301072 & start_date==mdy(11,1,2006) & check==1519
  drop if client_num==301234 & start_date==mdy(11,1,2006) & check==1609
  drop if client_num==301262 & start_date==mdy(12,1,2005) & check==1640
  drop if client_num==301267 & start_date==mdy(1,1,2007) & check==1656
  drop if client_num==301273 & start_date==mdy(12,1,2006) & check==1684
  drop if client_num==301339 & start_date==mdy(5,1,2005) & check==1760
  drop if client_num==500013 & start_date==mdy(1,1,2007) & check==1822
  drop if client_num==500042 & start_date==mdy(12,1,2006) & check==1866
  drop if client_num==500070 & start_date==mdy(3,1,2007) & check==1875
  drop if client_num==500072 & start_date==mdy(3,1,2007) & check==1888
  drop if client_num==1900084 & start_date==mdy(12,1,2006) & check==1957
  drop if client_num==1900129 & start_date==mdy(5,1,2003) & check==2014
  drop if client_num==1900154 & start_date==mdy(7,3,2001) & check==2031
  drop if client_num==1900185 & start_date==mdy(1,1,2007) & check==2052
  drop if client_num==1900254 & start_date==mdy(9,1,2007) & check==2129
  drop if client_num==3000800 & start_date==mdy(9,1,2005) & check==108
  drop if client_num==3003830 & start_date==mdy(11,1,2006) & check==613
  drop if client_num==3003920 & start_date==mdy(11,1,2006) & check==634
  drop if client_num==3004460 & start_date==mdy(9,1,2001) & check==727
  drop if client_num==4700014 & start_date==mdy(3,1,2007) & check==2210
  drop if client_num==4700017 & start_date==mdy(4,1,2007) & check==2219
  drop if client_num==4700018 & start_date==mdy(4,1,2007) & check==2223
  drop if client_num==4700038 & start_date==mdy(2,1,2007) & check==2250
  drop if client_num==4700039 & start_date==mdy(2,1,2007) & check==2254
  drop if client_num==4700061 & start_date==mdy(11,12,2007) & check==2284
  drop if client_num==19000800 & start_date==mdy(10,20,2004) & check==1940
  drop if client_num==19001850 & start_date==mdy(11,1,2006) & check==2055
  
  sort policy_life start_date end_date premium
    
  drop if policy_life==policy_life[_n-1] & start_date==start_date[_n-1] & end_date==end_date[_n-1]
  drop ttlpolicy 
  by policy_life,sort: egen ttlpolicy=count(license)
  summarize policy_length
  sort license model start_date end_date policy

save clean1,replace

*Now we have a clean panel data of policies - must combine
use all,clear

  drop if short_life<0.5 & ttlpolicy>1
  append using clean1
  sort policy_life start_date
  drop policy_start policy_end
  disp _N
   
  sort license model start_date end_date policy 
save panel_private,replace

use panel_private,clear

  disp _N
  
  *phase 1 lets check if policy owners are male or female
  rename name1 name
  sort name
  merge name using names
  tab _merge
  keep if _merge==3
  drop name _merge
  rename sex owner_sex
  rename name2 name
  sort name
  merge name using names
  drop if _merge==2
  tab _merge
  drop name _merge
  rename sex driver2_sex
  rename name3 name
  sort name
  merge name using names
  drop if _merge==2
  tab _merge
  drop name _merge
  rename sex driver3_sex

  sort city
  merge city using city_chars2
  tab _merge
  *Drops policy that is missing too much info
  keep if _merge==3
  drop _merge

  sort license english_city insurance start_date
  gen accident=0
  gen id=_n
  sort license id

save private_panel_insurance,replace
disp _N

*Classifies accident type: Collision if class==1
insheet using classification.csv,names clear
  sort file_num client_num policy license accident_date
  drop start_date end_date
save class,replace

insheet using private_claims.csv,names clear

  drop damage_code damage_estimate
  sort file_num client_num policy license accident_date
  merge file_num client_num policy license accident_date using class
  keep if _merge==3
  drop _merge
  disp _N
  sort location
  merge m:1 location using accident_loc
  tab _merge
  drop if _merge==2
  *this was a stata language matching issue
  replace english_city="Bene Ayish" if file_num==10009904 
  drop _merge 
  rename english_city accident_city
  rename start_date sdate_check
  rename end_date edate_check
  sort license 
  gen a_date=date(accident_date,"MDY")
  format a_date %d
  drop accident_date
  rename a_date accident_date
  gen counter=1
  keep if class==1
  tab damage_code
  replace counter=counter[_n-1]+1 if license==license[_n-1]
  egen max_accident=max(counter)
  sort license
  
save private_claims,replace

use private_claims,clear

local stopper=max_accident[_N]
disp "`stopper'"
foreach x of numlist 1/`stopper' {
  use private_claims,clear
    keep if counter==`x'
    drop counter max_accident
    sort license
    merge 1:m license using private_panel_insurance
    keep if _merge==3
    replace accident=1 if (accident_date>=start_date & accident_date<=end_date)
    keep if accident==1
    sort license
    drop _merge
  save private_panel_insurance`x',replace
  disp _N
}

*this is all the private accident data
use private_panel_insurance1,clear
  append using private_panel_insurance2
  append using private_panel_insurance3
  append using private_panel_insurance4
  append using private_panel_insurance5
  append using private_panel_insurance6
  append using private_panel_insurance7

  *now we must merge with the source
  sort license id
  keep license id accident_date damage description sdate_check edate_check damage_code damage_estimate /*
    */ location accident_city accident_distance accident
  merge m:1 license id using private_panel_insurance
  tab _merge
  
  sort license model start_date end_date policy
save pv1,replace

*Now we need to group together accidents
*Now all the accidents are arranged by date
*this is arranging accidents for individuals who had a few accidents in the same period
use pv1,clear
  disp _N
  sort id accident_date 
  drop counter
  gen counter=1
  replace counter=counter[_n-1]+1 if id==id[_n-1]
  tab counter
  egen max_accident=max(counter)
  gen fault=0
  replace fault=1 if damage=="at fault"
save base,replace

local stopper=max_accident[_N]
disp "`stopper'"

foreach x of numlist 2/`stopper' {
  use base,clear
    keep if counter==`x'
    rename sdate_check sdate_check`x'
    rename edate_check edate_check`x'
    rename damage_code damage_code`x'
    rename damage_estimate damage_estimate`x'
    rename damage damage`x'
    rename description description`x'
    rename location location`x'
    rename accident_city accident_city`x'
    rename accident_distance accident_distance`x'
    rename accident_date accident_date`x'
    rename fault fault`x'
    replace accident=`x'
    keep id sdate_* edate_* damage* description* location* accident* fault*
    sort id
  save accident`x',replace
}

use accident3,clear
  merge id using accident2
  tab _merge
  drop _merge
  sort id
save multi_accidents,replace
disp _N

use base,clear
  keep if counter==1
  disp _N
  sort id
  drop _merge
save upto1,replace

use multi_accidents,clear
  merge 1:1 id using upto1
  tab _merge
  drop _merge
  gen ttl_fault=fault
  tab accident
  replace ttl_fault=ttl_fault+fault2 if accident==2
  replace ttl_fault=ttl_fault+fault3 if accident==3
save private_vehicles,replace
use private_vehicles,clear
disp _N
tab accident


/*Group within company with private car and company insurance*/

insheet using company_policies.csv,clear

  summarize

  *Arranging dates
  gen s_date=date(start_date,"MDY")  
  format s_date %d
  gen e_date=date(end_date,"MDY")
  format e_date %d
  drop start_date end_date
  rename s_date start_date
  rename e_date end_date

  *this is the day we received the data
  replace end_date=date("24 april 2008","DMY") if end_date>date("24 april 2008","DMY") 

  *Again we only want drivers with full insurance
  keep if insurance==1
  disp _N
  gen policy_length=(end_date-start_date)/365.2
  summarize policy_length

  *The license variable is problematic so a car is determined by driver & model
  egen policy_life=group(client_num model)
  
  sort client_num policy start_date end_date
  merge m:1 client_num policy start_date using company_new
  tab _merge
  drop if _merge==2
  drop _merge
  gen new=0
  replace new=1 if action=="new"
  
  gsort policy_life start_date -end_date policy

  *If we have a policy that is fully included in the one before we should drop it
  *but need to make sure we are keeping the correct policy numbers for future matching
  drop if policy_life==policy_life[_n-1] & start_date>=start_date[_n-1] & end_date<=end_date[_n-1] /*
  */ & client_num!=2200366 & client_num!=2300043 & client_num!=2300064
  disp _N
  
  gsort policy_life start_date -end_date -policy
  drop if policy_life==policy_life[_n-1] & start_date>=start_date[_n-1] & end_date<=end_date[_n-1] 
 
  by policy_life, sort:egen short_life=min(policy_length)
  by policy_life,sort: egen ttlpolicy=count(license)
  tab ttlpolicy
  
save all_company,replace

use all_company,clear

  summarize short_life
  keep if short_life<0.5 & ttlpolicy>1
  disp _N

  *First policy's with 2 phases lets combine to 1
  by policy_life, sort:egen policy_start=min(start_date)
  by policy_life, sort:egen policy_end=max(end_date)
  
  sort policy_life start_date policy
  replace start_date=policy_start if ttlpolicy==2
  replace end_date=policy_end if ttlpolicy==2

  gen counter=1
  replace counter=counter[_n-1]+1 if policy_life==policy_life[_n-1]

  *adjusting policies to include a short_life - these policies have less overlap than private
  replace start_date=start_date[_n-1] if policy_life==policy_life[_n-1] & /*
    */ ttlpolicy>2 & policy_length[_n-1]==short_life 
  replace end_date=end_date[_n+1] if policy_life==policy_life[_n+1] & /*
    */ ttlpolicy>2 & policy_length==short_life
  
  replace policy_length=(end_date-start_date)/365.2
  drop short_life
  by policy_life, sort:egen short_life=min(policy_length)
 
 *Let's clean up what we fixed 
  sort policy_life start_date end_date policy
  drop if policy_life==policy_life[_n-1] /*
    */ & start_date==start_date[_n-1] & end_date==end_date[_n-1]

  drop ttlpolicy 
  by policy_life,sort: egen ttlpolicy=count(license)

  summarize policy_length

save clean,replace

use all_company,clear

  drop if short_life<0.5 & ttlpolicy>1
  append using clean
  sort policy_life start_date
  drop policy_start policy_end

  *fixing overlaps
  replace end_date=16538 if client_num==200139 & start_date==15949
  replace start_date=15949 if policy_life==399
  replace end_date=16224 if policy_life==399
  replace end_date=16642 if client_num==200463 & start_date==16315
  replace end_date=17111 if client_num==200463 & start_date==17045
  replace end_date=15474 if policy_life==734
  replace end_date=16607 if client_num==2200391 & start_date==16315
  replace end_date=16314 if client_num==200139 & start_date==15949
  replace end_date=16538 if client_num==200139 & start_date==16315
  replace end_date=16461 if client_num==200482 & start_date==16315

  *lets check for overlaps
  gen must_split=0
  sort client_num start_date policy
  replace must_split=1 if (client_num==client_num[_n-1] & start_date<end_date[_n-1])
  by client_num,sort:egen ttl_split=sum(must_split)
  tab ttl_split
  replace end_date=start_date[_n+1]-1 if must_split[_n+1]==1 & ttl_split==1  
  disp _N
  *we drop these overlaps
  drop if start_date>end_date
save panel_company,replace


tab ttlpolicy
summarize policy_length 
 

use panel_company,clear

  *phase 1 lets check if policy owners are male or female
  rename name1 name
  sort name
  merge name using names
  tab _merge
  keep if _merge==3
  drop name _merge
  rename sex owner_sex
  rename name2 name
  sort name
  merge name using names
  tab _merge
  drop if _merge==2
  drop name _merge
  rename sex driver2_sex
  rename name3 name
  sort name
  merge name using names
  drop if _merge==2
  tab _merge
  drop name _merge
  rename sex driver3_sex

  *information on residence
  sort city
  merge city using city_chars2
  tab _merge
  keep if _merge==3
  drop _merge
  disp _N
  
  *missing information on vehicle
  drop if year==0

  sort client_num policy city model insurance start_date
  gen id=_n
  sort client_num policy id
  gen accident=0
  
save company_panel_insurance,replace

insheet using company_claims.csv,names clear

  drop damage_code damage_estimate
  sort file_num client_num policy license accident_date
  merge m:1 file_num client_num policy license accident_date using class
  keep if _merge==3
  drop _merge
  disp _N
  *dealing with stata language issues
  replace location="������" if file_num==10007666 | file_num==10007281 | file_num==10001810 /*
    */ | file_num==10006857 | file_num==10011043

  sort location
  merge m:1 location using accident_loc2
  tab _merge
  drop _merge
  rename english_city accident_city

  rename start_date sdate_check
  rename end_date edate_check

  gen a_date=date(accident_date,"MDY")
  format a_date %d
  list a_date accident_date if a_date==. & accident_date!=""
  drop accident_date
  rename a_date accident_date
  sort client_num accident_date

  *These are clients whose policies was combined (they now have a new policy number)
  replace policy=220259 if client_num==201153 & policy==220768
  replace policy=213659 if client_num==2200378 & policy==212714
  replace policy=215398 if client_num==2200406 & policy==216282
  replace policy=223009 if client_num==2200480 & policy==222622
  replace policy=224601 if client_num==2300097 & policy==225350
  replace policy=100771 if client_num==201123 & policy==0
  replace policy=222387 if client_num==2300036 & policy==222996
  replace policy=211481 if client_num==2200338 & policy==220161

  keep if class==1

  gen counter=1
  replace counter=counter[_n-1]+1 if client_num==client_num[_n-1]
  egen max_accident=max(counter)
  
save company_claims,replace

use company_claims,clear 
local stopper=max_accident[_N]
disp "`stopper'"
foreach x of numlist 1/`stopper' {
  use company_claims,clear
    keep if counter==`x'
    drop counter max_accident
    sort client_num policy
    merge client_num policy using company_panel_insurance
    keep if _merge==3
    replace accident=1 if (accident_date>=start_date & accident_date<=end_date)
    keep if accident==1
    sort client_num policy
    drop _merge
  save company_panel_insurance`x',replace
  disp _N
}
*this is all the accident data
use company_panel_insurance1,clear
  append using company_panel_insurance2  
  append using company_panel_insurance3
  append using company_panel_insurance4
  append using company_panel_insurance5
  append using company_panel_insurance6
  append using company_panel_insurance7
  append using company_panel_insurance8

  tab accident

  *now we must merge with the source
  sort client_num policy id
  merge client_num policy id using company_panel_insurance
  tab _merge

save cv1,replace

*Now all the accidents are arranged by date
use cv1,clear
  sort id accident_date
  drop counter
  gen counter=1
  replace counter=counter[_n-1]+1 if id==id[_n-1]
  tab counter
  egen max_accident=max(counter)
  gen fault=0
  replace fault=1 if damage=="at fault"
save cbase,replace

local stopper=max_accident[_N]
disp "`stopper'"
foreach x of numlist 2/`stopper' {
  use cbase,clear
    keep if counter==`x'
    rename sdate_check sdate_check`x'
    rename edate_check edate_check`x'
    rename damage_code damage_code`x'
    rename damage_estimate damage_estimate`x'
    rename damage damage`x'
    rename description description`x'
    rename location location`x'
    rename accident_city accident_city`x'
    rename accident_distance accident_distance`x'
    rename accident_date accident_date`x'
    rename fault fault`x'
    replace accident=`x'
    keep id sdate_* edate_* damage* description* location* accident* fault*
    sort id
  save caccident`x',replace
}

use caccident4,clear
   merge id using caccident3
   tab _merge
   drop _merge
   sort id
   merge id using caccident2
   tab _merge
   drop _merge
   sort id
save multi_caccidents,replace
disp _N

use cbase,clear
  keep if counter==1
  disp _N
  sort id
  drop _merge
save cupto1,replace

use multi_caccidents,clear

  sort id
  merge id using cupto1
  tab _merge
  drop _merge must_split ttl_split
  tab accident
  gen company=1
 
  *need to fix remaining overlap
  *large overlap - assume additional driver
  replace client_num=2002960 if policy_life==212 | policy_life==215
  replace client_num=2000710 if policy_life==47 | policy_life==48
  replace client_num=2000711 if policy_life==46
  
  *these are overlaps that must be dropped
  drop if client_num==2000710 & start_date==17563
  drop if policy_life==428 | policy_life==98 | policy_life==735 | policy_life==1120 | policy_life==1150
  drop if client_num==2300031 & start_date==17045 & model==737244
    
save company_vehicles,replace
disp _N
tab accident

*General Cleaning
use private_vehicles,clear

  gen company=0
  append using company_vehicles  

  gen start_year=year(start_date)
  gen end_year=year(end_date)

  gen mid_year=round((start_year+end_year)/2)
  gen accident_month=month(accident_date)
  by company, sort: summarize start_year end_year mid_year accident_month
 
  replace engine=engine*100 if engine<100

  order english_city bagrut*

  gen bagrut=.
  replace bagrut=bagrut2001 if mid_year<=2001
  replace bagrut=bagrut2002 if mid_year==2002
  replace bagrut=bagrut2003 if mid_year==2003
  replace bagrut=bagrut2004 if mid_year==2004
  replace bagrut=bagrut2005 if mid_year==2005
  replace bagrut=bagrut2006 if mid_year==2006
  replace bagrut=bagrut2007 if mid_year>=2007

  *these are all in 2001 NIS
  gen avg_inc=avg_inc2001
  replace avg_inc=avg_inc2002 if mid_year==2002
  replace avg_inc=avg_inc2003 if mid_year==2003
  replace avg_inc=avg_inc2004 if mid_year==2004
  replace avg_inc=avg_inc2005 if mid_year==2005
  replace avg_inc=avg_inc2006 if mid_year>=2006

  gen winter=.
  replace winter=0 if accident_month!=.
  replace winter=1 if (accident_month>=11 | accident_month<=3) & accident_month!=.

  replace accident_city=accident_city2 if accident_city==""
  replace accident_city=accident_city3 if accident_city==""
  sort english_city accident_city
  merge m:1 english_city accident_city using residence_accidents
  tab _merge
  drop if _merge==2
  drop _merge

  gen d_accident=0
  replace d_accident=1 if accident>0
  tab accident

  gen ttl_damage=damage_estimate
  replace ttl_damage=(damage_estimate+damage_estimate2)/2 if accident==2
  replace ttl_damage=(damage_estimate+damage_estimate2+damage_estimate3)/3 if accident==3
  replace ttl_damage=(damage_estimate+damage_estimate2+damage_estimate3+damage_estimate4)/4 if accident==4

  replace accident_distance=distance if accident_city=="home"
  gen avg_dist=accident_distance
  replace avg_dist=(accident_distance+accident_distance2)/2 if accident==2
  replace avg_dist=(accident_distance+accident_distance2+accident_distance3)/3 if accident==3
  replace avg_dist=(accident_distance+accident_distance2+accident_distance3+accident_distance4)/4 if accident==4

  replace new=1 if action=="new"
  keep id license client_num policy_life damage_estimate accident_distance year engine insurance /*
    */ start_date end_date owner_sex distance bagrut* avg_inc* accident_date* accident fault  /*
    */ ttl_fault company policy_length start_year end_year mid_year winter direction /*
    */ d_accident model avg_dist ttl_damage damage_code* tot_* new tot_premium residence_accident

  summarize policy_life
  replace policy_life=policy_life+2000 if company==1

  by company, sort: summarize policy_life

  tostring id,replace
  replace id="c"+id if company==1

  gen d_internal=0
  gen d_NE=0
  gen d_NW=0
  gen d_SE=0
  gen d_SW=0
  replace d_internal=1 if direction=="I"
  replace d_NE=1 if direction=="NE"
  replace d_NW=1 if direction=="NW"
  replace d_SE=1 if direction=="SE"
  replace d_SW=1 if direction=="SW"
 
save total_collision,replace 

use total_collision,clear

  tab mid_year
  drop if end_date<start_date 
   
  by client_num,sort: egen ttl_policies=count(license)
  tab ttl_policies
  sort client_num start_date
   
  gen period=1
  replace period=period[_n-1]+1 if client_num==client_num[_n-1]
  bysort client_num:egen num_periods=max(period)
  drop if num_periods==1
   
  *Creating a mean information value on types of cars people drove
  bysort client_num:egen mean_engine=mean(engine)
  bysort client_num:egen mean_year=mean(year)
  bysort client_num:egen mean_bagrut=mean(bagrut)
  bysort client_num:egen mean_income=mean(avg_inc)

  tsset client_num period

  *Let's create a variable for whether people were insured in winter
  gen p_winter=0
  replace p_winter=1 if policy_length>=10/12
  gen start_month=month(start_date)
  gen end_month=month(end_date)
  replace p_winter=1 if start_month==1 
  replace p_winter=1 if start_month==5 & policy_length>=9/12
  replace p_winter=1 if start_month==6 & policy_length>=8/12
  replace p_winter=1 if start_month==7 & policy_length>=7/12
  replace p_winter=1 if start_month==8 & policy_length>=6/12
  replace p_winter=1 if start_month==9 & policy_length>=5/12
  replace p_winter=1 if start_month==10 & policy_length>=4/12
  replace p_winter=1 if start_month==11 | start_month==12 
  tabulate period, generate(per)

  *Let's create a variable for the total length of time people were insured
  bysort client_num:egen client_start=min(start_date)
  bysort client_num:egen client_end=max(end_date)
  gen dur_client=client_end-client_start
  gen dur_client_y=dur_client/365.2

  drop fault
  gen fault=0
  replace fault=1 if damage_code==2 | damage_code2==2 | damage_code3==2 | damage_code4==2
   
  by client_num: gen daccident1=d_accident[1]
  by client_num: gen length1=policy_length[1]
  bysort client_num (period): gen daccident_l=L.d_accident

  gen year_04=0
  gen year_05=0
  gen year_06=0
  gen year_07=0 

  replace year_04=1 if year(start_date)==2004 | year(end_date)==2004
  replace year_05=1 if year(start_date)==2005 | year(end_date)==2005
  replace year_06=1 if year(start_date)==2006 | year(end_date)==2006
  replace year_07=1 if year(start_date)>=2007 | year(end_date)>=2007
   
  replace policy_length=(end_date-start_date)/365.2
  
  sort client_num start_date end_date
save weisburd_collisions,replace   

insheet using parking.csv, clear
  keep license accident_date parking
  gen date=date(accident_date,"MDY")
  drop accident_date
  rename date accident_date 
  format accident_date %d
  sort license accident_date
save parking, replace
  
insheet using car_models_byyear.csv, clear
  drop id
  sort start_year model year
  keep if car_value!=.
  drop if start_year==start_year[_n-1] & model==model[_n-1] & year==year[_n-1]
save models,replace
  keep if start_year==2007
  replace start_year=2008
  sort start_year model year
save models_08,replace

use weisburd_collisions,clear

  *let's merge with data on whether this accident was a parking accident
  sort license accident_date
  merge license accident_date using parking
  drop if _merge==2
  tab parking
  drop _merge
  replace parking=0 if parking==.
  
  sort start_year model year
  merge start_year model year using models
  tab _merge
  drop if _merge==2
  drop _merge
  summarize start_year if car_value==.
  sort start_year model year
  merge m:1 start_year model year using models_08,update
  tab _merge
  drop if _merge==2
  by company, sort: summarize car_value
  ttest car_value, by(company) 

  gen cpi=1
  replace cpi=1.08 if start_year==2002
  replace cpi=1.06 if start_year==2003
  replace cpi=1.07 if start_year==2004
  replace cpi=1.08 if start_year==2005
  replace cpi=1.11 if start_year==2006
  replace cpi=1.12 if start_year>=2007

  gen car_value01=car_value/cpi
  replace car_value01=car_value01/4.41
  by client_num, sort: egen mean_value=mean(car_value01)

  by company, sort: summarize car_value01
  ttest car_value01, by(company) 

  summarize tot_premium if company==0
  replace tot_premium=. if tot_premium<=0
  summarize tot_premium,d

  sort client_num start_date end_date

  *fill in missing values of premium
  replace tot_premium=tot_premium[_n-1] if tot_premium==. & client_num==client_num[_n-1]
  replace tot_premium=tot_premium[_n+1] if tot_premium==. & client_num==client_num[_n+1] 
  by start_year,sort: egen avg_premium=mean(tot_premium)
  replace tot_premium=avg_premium if company==0 & tot_premium==.

  gen c_insurance=.
  replace c_insurance=(0.1*2.36*tot_premium+0.2*2.36*tot_premium*daccident_l+160*4.41)/cpi if company==0
  replace c_insurance=160*4.41*0.75 if company==1
  replace c_insurance=c_insurance/4.41
  
  gen c_insurance_min=.
  replace c_insurance_min=(0.1*3*0.473*tot_premium+0.2*3*0.473*tot_premium*daccident_l+160*4.41)/cpi if company==0
  replace c_insurance_min=160*4.41*0.75 if company==1
  replace c_insurance_min=c_insurance_min/4.41

  gen c_insurance_max=.
  replace c_insurance_max=(0.1*3*tot_premium+0.2*3*tot_premium*daccident_l+160*4.41)/cpi if company==0
  replace c_insurance_max=160*4.41*0.75 if company==1
  replace c_insurance_max=c_insurance_min/4.41

  by period,sort: summarize c_insurance if company==0
  
  replace tot_premium=tot_premium/4.41
  replace avg_inc=avg_inc/4.41
  replace tot_premium=0 if company==1
  
  egen fid=group(client_num)
  
  keep bagrut avg_inc owner_sex residence_accident distance d_* avg_inc bagrut engine year tot_premium car_value01 /*
    */ c_insurance winter accident start_date end_date company period year fid parking /*
    */ new year_0* d_N* d_S* policy_length p_winter mean_* daccident1 c_insurance_min c_insurance_max fault
		
  
save weisburd_restat,replace


tab company
summarize policy_length
sort fid
drop if fid==fid[_n-1]
tab company
 
 
log close
